# global default settings for chunks
knitr::opts_chunk$set( eval = TRUE, warning = FALSE, message = FALSE,
fig.dim = c(6, 3)
)
# loaded packages; placed here to be able to load global settings
#install.packages(c("kableExtra", "patchwork","DescTools","tidyverse","tseries","DataCombine","forecast","plotly","ggpubr","gridExtra","pastecs","RColorBrewer", "xts"))
#install.packages(c( "tidyverse", "fpp3","GGally", "sugrrants"))
#install.packages("fabletools",repos = "https://tidyverts.org")
#install.packages("tidyverts")
Packages <- c("tidyverse", "arsenal", "readxl",
"patchwork", "GGally", "ghibli",
"plotly","ggpubr","gridExtra",
"pastecs","DataCombine",
"tseries","kableExtra","viridis",
"RColorBrewer", "xts", "fabletools",
"fpp3","tsibble", "formattable", "forecast")
invisible(lapply(Packages, library, character.only = TRUE))
# theme global setting for ggplot
theme_set(theme_minimal() +
theme(legend.position = "bottom") +
theme(plot.title = element_text(hjust = 0.5, size = 12),
plot.subtitle = element_text(hjust = 0.5, size = 8))
)
This dataset contains information for years 2004 - 2019, however only 2008 onward will be analyzed.
#Here I am loading in the data.
skbt<- read.csv("./data/GtrendData/bleachingTime08_18.csv", header = T, sep = ",",stringsAsFactors = F, col.names = c("time","rsv")) %>%
#deleting first row that contains duplicate row name by using one of the column names
filter(time != "Month") %>%
#formating columbs to correct class and separating time varaiable to year and month
mutate(rsv = as.numeric(rsv)) %>%
rename(
date = time) %>%
separate(date, into = c("year","month"), sep = "\\-" ) %>%
mutate(
month = as.numeric(month),
month_name = month.abb[month] %>% as_factor(),
year_f = as.factor(year),
id = c(1:132)) %>%
arrange(year) %>%
select(id, year, year_f,month_name, month, rsv)
Relative search interest is right skewed. Given that it is already on a relative scale it may be best leave it untransfromed for preliminary analysis.
skbt %>%
ggplot(aes(x = rsv))+
geom_histogram()+
labs(title = "Distribution of Relative Search Volume between 2008 - 2018")
#not neccessary. ####################
skbt %>%
pivot_wider(
names_from = month_name,
values_from = rsv
) %>%
as_tibble()
###############################
We are interested in understanding how the data changes overtime. Here we see the distribution of RSV by month. Very clearly we observe that their may be some seasonality where the highest RSV in every year may be in the summer months of June and July.
skbt %>%
mutate(month_name = forcats::fct_relevel(month_name,"Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")) %>%
ggplot(aes(x = month_name, y = rsv))+
geom_violin()+
ggtitle("Distibution of Relative Search Volume By Month for years 2008 - 2018")
The temporal trend is both represented by year and month where the colors on both graphs correspond with the same year and month that the observation occured. Ex: The peak in 2009 from the trend plot is directly associated with the peak in month July from the seasonal plot.
#plot of rsv by year
b0<- skbt %>%
mutate(month_name = forcats::fct_relevel(month_name, c("Jan", "Feb","Mar","Apr",
"May","Jun","Jul","Aug",
"Sep","Oct","Nov","Dec")),
month_cont = c(1:132)) %>%
ggplot(aes(x = month_cont, y = rsv, col = year_f))+geom_line()+
scale_fill_viridis_d(option = "d", aesthetics = "col", name = "Year Color")+theme(legend.position = "bottom", axis.text.x = element_blank())+
labs(
title = "SKin Bleaching Trend from 2008 To 2018",
x = "Year",
y = "Relative Search Volume"
) + theme_minimal()
#plot of rsv by month
b1<- skbt %>%
mutate(
month_name = forcats::fct_relevel(month_name, "Jan", "Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),
month_cont = c(1:132)) %>%
ggplot(aes(x=month_name, y= rsv, col = year_f, group = year
))+geom_line()+
scale_fill_viridis_d(option = "D", aesthetics = "col", name = "Year Color")+
labs(title = "Seasonal Plot of Relative Search Volume for years 2008 - 2018", x = " Month", y="Relative Search Volume")+theme(legend.position = "bottom")+theme_minimal()
b0 + plot_layout(ncol = 2) +b1
This plot represents the mean rsv and its standard deviation of each year.
skbt %>%
group_by(year) %>%
summarize(
mean_rsv = mean(rsv, na.rm = T),
sd_rsv = sd(rsv)) %>%
ggplot(aes(x =year, y=mean_rsv))+geom_point()+geom_smooth(formula = y ~ x + x^2, method = "auto", stat = "smooth")+
geom_errorbar(aes(ymin=mean_rsv-sd_rsv, ymax=mean_rsv+sd_rsv), width=.2,
position=position_dodge(.9))
This plot is the percent change in rsv from year to year eg: (the percent change between 2008 and 2009 is shown by the dot in 2008, likewise the percent change between 2017 and 2018 is shown in 2017 which is why 2018 does not have a value.
skbt %>%
group_by(year) %>%
summarize(
mean_rsv = mean(rsv, na.rm = T),
sd_rsv = sd(rsv)) %>%
mutate(
per_change = (lead(mean_rsv)-mean_rsv)/mean_rsv*100) %>% #,
#per_change2 = (lead(mean_rsv)-mean_rsv[1])/mean_rsv[1]*100) %>%
ggplot()+
geom_point(aes(x = year, y = per_change, colour = "red"))
This plot is the percent change in rsv for every year compared to 2008 eg: (the percent change between 2008 and 2009 is shown by the dot in 2008, likewise the percent change between 2008 and 2018 is shown in 2017 which is why 2018 does not have a value.
skbt %>%
group_by(year) %>%
summarize(
mean_rsv = mean(rsv, na.rm = T),
sd_rsv = sd(rsv)) %>%
mutate(
#per_change = (lead(mean_rsv)-mean_rsv)/mean_rsv*100) %>% #,
per_change2 = (lead(mean_rsv)-mean_rsv[1])/mean_rsv[1]*100) %>%
ggplot()+
geom_point(aes(x = year, y = per_change2, colour = "red"))#+
#geom_errorbar(aes(ymin=mean_rsv-sd_rsv, ymax=mean_rsv+sd_rsv, x = lead(year)), width=.2,
# position=position_dodge(.9))
# geom_point(aes(x = year, y = per_change))
Given the mean rsv seems to have a large variablilty across years, we decided to look that the median rsv.
skbt %>%
group_by(year) %>%
summarize(
med_rsv = median(rsv, na.rm = T),
IQR_m = IQR(rsv)) %>%
mutate(
per_change = (lead(med_rsv)-med_rsv)/med_rsv*100,
per_change2 = (lead(med_rsv)-med_rsv[1])/med_rsv[1]*100) %>%
#select(per_change, per_change2, m ) %>%
ggplot() +
geom_point(aes(x = year, y = med_rsv)) +
geom_errorbar(aes(ymin=med_rsv-IQR_m, ymax=med_rsv+IQR_m, x = year), width=.2,
position=position_dodge(.9))
#summary()
#try to calculate the median of each year then do percent change.
#look at timeseries papers to see how to explain the results
#summarize by year; median, IQR, with a line plot and dots.
#formatable table
#expand y axis to see more detail.
Here we observe the decomposed form of the temporal data. There seems to be an increasing trend and seasonal pattern.
skbt %>%
pull(rsv) %>%
ts(frequency = 12) %>%
na.omit() %>%
stl( s.window = "periodic") %>%
plot()
To statistically assess the data for a trend and seasonality we untilized the autocorellation function and produced a corelogram. In this plot we obaserve a positive test for a trend and seasonality being yearly. In addition the spearman’s rank correlation suggest a evidence of a trend p_value < 0.0001
skbt %>%
pull(rsv) %>%
ggAcf()
skbt %>%
pull(rsv) %>%
ts() %>%
pastecs::trend.test(R = 1) %>%
broom::tidy() %>%
select(method, estimate, p.value) %>%
rename("r" = estimate) %>%
formattable(align = c("l","c","c"))
| method | r | p.value |
|---|---|---|
| Spearman’s rank correlation rho | 0.4240577 | 4.043239e-07 |
skbt %>%
pull(rsv) %>%
ts(start = 2008, delta = 1/12,
# name = "year"
) %>%
na.omit() %>%
stl( s.window = "periodic") %>%
seasadj() %>%
ggsubseriesplot(main = "Skin Bleaching")
#skbt %>% as_tsibble(index = id) %>%
# select(year, rsv) %>%
#gg_season(rsv, period = "week")
#Loading in the data
skbr <- read.csv("./data/GtrendData/bleachingRegion08_18.csv", header = T, sep = ",",stringsAsFactors = F, col.names = c("region","rsv"))
skbr<- skbr[-1,] %>%
#removing duplicated column names
mutate(rsv = as.numeric(rsv)) %>%
rename(state = region)
The regional distribution is very odd.
skbr %>%
mutate(rsv = recode(rsv, .missing = 0)) %>%
ggplot(aes(x = rsv))+geom_histogram(bins = 30 )+labs(title = "Distribution of Regional Relative Search Intersts for year 2008 - 2018")
# used this data set to obtain state codes used in plotly to map according to state
codes <- read_csv("https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv") %>%
select(state, code)
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
#merging both the codes and rsv together within ploty commands using left_join.
p <- plot_geo(
#data set
left_join(codes, skbr, by = "state"),
#map
locationmode = 'USA-states') %>%
add_trace(
z = ~rsv,
#text = ~hover,
locations = ~code,
color = ~rsv, colors = 'Purples'
) %>%
colorbar(title = "Volume In Percentage") %>%
layout(
title = 'Relative Search Volume of <br> Skin Bleaching in The US 2008 - 2018',
geo = g
)
p
Table of states in the upperquartile of rsv (>=72%).
skbr %>%
filter(rsv >= 72) %>%
formattable(align = c("l", "c"))
| state | rsv |
|---|---|
| Maryland | 100 |
| New York | 94 |
| Georgia | 91 |
| Mississippi | 83 |
| Louisiana | 81 |
| Florida | 80 |
| Nevada | 78 |
| District of Columbia | 77 |